{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Locally and Deploy to GCP\n",
    "The following notebook trains an XGBoost model locally and deploys the resulting model file to GCP using the ML Engine Online Prediction API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright 2019 Google Inc. All Rights Reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "\n",
    "# Update the following variables for your project\n",
    "PROJECT_ID   = '<project-id>'\n",
    "VERSION_DIR  = 'gs://<bucket-name>/<folder-name>/'\n",
    "MODEL_NAME   = '<model-name>'\n",
    "VERSION_NAME = '<version-name>'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define training and evaluation functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import argparse\n",
    "import pandas as pd\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "from sklearn.model_selection import train_test_split\n",
    "from xgboost import XGBRegressor\n",
    "import urllib.request\n",
    "try:\n",
    "    from sklearn.preprocessing import Imputer\n",
    "except ImportError:\n",
    "    from sklearn.impute import SimpleImputer as Imputer\n",
    "\n",
    "import logging\n",
    "logging.getLogger().setLevel(logging.INFO)\n",
    "\n",
    "TRAINING_URL=\"https://raw.githubusercontent.com/kubeflow/examples/master/xgboost_ames_housing/ames_dataset/train.csv\"\n",
    "TRAINING_FILE=\"train.csv\"\n",
    "\n",
    "ESTIMATORS=1000\n",
    "LEARNING_RATE=0.1\n",
    "TEST_FRACTION_SIZE=0.25\n",
    "EARLY_STOPPING_ROUNDS=50\n",
    "\n",
    "def run_training_and_eval():\n",
    "    (train_X, train_y), (test_X, test_y) = read_input()\n",
    "    model = train_model(train_X,\n",
    "                        train_y,\n",
    "                        test_X,\n",
    "                        test_y,\n",
    "                        ESTIMATORS,\n",
    "                        LEARNING_RATE)\n",
    "\n",
    "    eval_model(model, test_X, test_y)\n",
    "\n",
    "def download(url, file_name):\n",
    "    with urllib.request.urlopen(url) as response, open(file_name, \"wb\") as file:\n",
    "        file.write(response.read())\n",
    "\n",
    "def read_input(test_size=TEST_FRACTION_SIZE):\n",
    "    \"\"\"Read input data and split it into train and test.\"\"\"\n",
    "    download(TRAINING_URL, TRAINING_FILE)\n",
    "    data = pd.read_csv(TRAINING_FILE)\n",
    "    data.dropna(axis=0, subset=['SalePrice'], inplace=True)\n",
    "\n",
    "    y = data.SalePrice\n",
    "    X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])\n",
    "\n",
    "    train_X, test_X, train_y, test_y = train_test_split(X.values,\n",
    "                                                        y.values,\n",
    "                                                        test_size=test_size,\n",
    "                                                        shuffle=False)\n",
    "\n",
    "    imputer = Imputer()\n",
    "    train_X = imputer.fit_transform(train_X)\n",
    "    test_X = imputer.transform(test_X)\n",
    "\n",
    "    return (train_X, train_y), (test_X, test_y)\n",
    "\n",
    "def train_model(train_X,\n",
    "                train_y,\n",
    "                test_X,\n",
    "                test_y,\n",
    "                n_estimators,\n",
    "                learning_rate):\n",
    "    \"\"\"Train the model using XGBRegressor.\"\"\"\n",
    "    model = XGBRegressor(n_estimators=n_estimators,\n",
    "                      learning_rate=learning_rate)\n",
    "\n",
    "    model.fit(train_X,\n",
    "              train_y,\n",
    "              early_stopping_rounds=EARLY_STOPPING_ROUNDS,\n",
    "              eval_set=[(test_X, test_y)])\n",
    "\n",
    "    logging.info(\"Best RMSE on eval: %.2f with %d rounds\",\n",
    "                 model.best_score,\n",
    "                 model.best_iteration+1)\n",
    "    return model\n",
    "\n",
    "def eval_model(model, test_X, test_y):\n",
    "    \"\"\"Evaluate the model performance.\"\"\"\n",
    "    predictions = model.predict(test_X)\n",
    "    logging.info(\"mean_absolute_error=%.2f\", mean_absolute_error(predictions, test_y))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train and evaluate the model locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(train_X, train_y), (test_X, test_y) = read_input()\n",
    "model = train_model(train_X,\n",
    "                        train_y,\n",
    "                        test_X,\n",
    "                        test_y,\n",
    "                        ESTIMATORS,\n",
    "                        LEARNING_RATE)\n",
    "\n",
    "eval_model(model, test_X, test_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export the model using the joblib library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "joblib.dump(model, 'model.joblib')\n",
    "!gsutil cp model.joblib {VERSION_DIR}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Deploy the model to GCP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from kubeflow.fairing.deployers.gcp.gcpserving import GCPServingDeployer\n",
    "deployer = GCPServingDeployer()\n",
    "deployer.deploy(VERSION_DIR, MODEL_NAME, VERSION_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Send a prediction to the deployed model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from googleapiclient import discovery\n",
    "ml = discovery.build('ml', 'v1')\n",
    "\n",
    "resource_name = 'projects/{}/models/{}/versions/{}'.format(PROJECT_ID, MODEL_NAME, VERSION_NAME)\n",
    "ml.projects().predict(\n",
    "    name=resource_name,\n",
    "    body={\n",
    "        'instances': [\n",
    "            [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37]\n",
    "        ]\n",
    "    }\n",
    ").execute()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
